{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# LAB 04.01 - Cleaning Data" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "!wget --no-cache -O init.py -q https://raw.githubusercontent.com/rramosp/ai4eng.v1/main/content/init.py\n", "import init; init.init(force_download=False); init.get_weblink()\n", "\n", "init.endpoint" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "from local.lib.rlxmoocapi import submit, session\n", "session.LoginSequence(endpoint=init.endpoint, course_id=init.course_id, lab_id=\"L04.01\", varname=\"student\");" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Observe the following synthetic example with missing data: five randomly chosen entries of the `risk` column are set to `NaN`." ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import matplotlib.pyplot as plt\n", "%matplotlib inline\n", "from IPython.display import Image\n", "import numpy as np\n", "import seaborn as sns" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "n = 20\n", "place = np.r_[[\"Medellin\", \"Bogota\", \"Madrid\"]][(np.random.randint(3, size=n))]\n", "age = np.random.randint(50, size=n)+10\n", "children = np.r_[[(np.random.randint(2) if i<30 else (np.random.randint(4))) for i in age]]\n", "risk = np.r_[[np.random.random()*(.2 if i==\"Medellin\" else .8) for i in place]].round(3)\n", "risk[np.random.permutation(len(risk))[:5]]=np.nan" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | age | \n", "risk | \n", "children | \n", "place | \n", "
---|---|---|---|---|
0 | \n", "59 | \n", "0.112 | \n", "2 | \n", "Bogota | \n", "
1 | \n", "36 | \n", "0.093 | \n", "1 | \n", "Medellin | \n", "
2 | \n", "25 | \n", "0.638 | \n", "1 | \n", "Madrid | \n", "
3 | \n", "57 | \n", "NaN | \n", "1 | \n", "Madrid | \n", "
4 | \n", "59 | \n", "0.641 | \n", "0 | \n", "Madrid | \n", "
5 | \n", "50 | \n", "0.111 | \n", "3 | \n", "Bogota | \n", "
6 | \n", "58 | \n", "0.633 | \n", "2 | \n", "Bogota | \n", "
7 | \n", "13 | \n", "0.025 | \n", "0 | \n", "Medellin | \n", "
8 | \n", "10 | \n", "NaN | \n", "1 | \n", "Bogota | \n", "
9 | \n", "58 | \n", "0.299 | \n", "3 | \n", "Madrid | \n", "
10 | \n", "52 | \n", "0.004 | \n", "1 | \n", "Medellin | \n", "
11 | \n", "47 | \n", "0.2 | \n", "0 | \n", "Madrid | \n", "
12 | \n", "55 | \n", "NaN | \n", "3 | \n", "Bogota | \n", "
13 | \n", "44 | \n", "0.742 | \n", "3 | \n", "Madrid | \n", "
14 | \n", "27 | \n", "NaN | \n", "0 | \n", "Madrid | \n", "
15 | \n", "54 | \n", "0.567 | \n", "2 | \n", "Bogota | \n", "
16 | \n", "28 | \n", "0.028 | \n", "0 | \n", "Medellin | \n", "
17 | \n", "24 | \n", "0.299 | \n", "1 | \n", "Bogota | \n", "
18 | \n", "57 | \n", "NaN | \n", "0 | \n", "Medellin | \n", "
19 | \n", "40 | \n", "0.081 | \n", "2 | \n", "Medellin | \n", "